Expression table

####################################
#Preparing the expression table
####################################

## Set working directory for first rna_batch of data and import rsem data ##
#dir = "/sc/hydra/projects/pd-omics/mynd/data/PBG-2097"
#samples = read.table("/sc/hydra/projects/pd-omics/mynd/qc/samples.rna_batch1", header = F, check.names = F)
#sample = samples$V1
#files = file.path(dir, sample, "Processed/RAPiD/rsem", paste0(sample, ".genes.results"))
#names(files) = sample
#txi.rsem = tximport(files, type = "rsem")
#counts1 = data.frame(txi.rsem$counts, check.names = F)
#tpms1 = data.frame(txi.rsem$abundance, check.names = F)

## Repeat for the second rna_batch of data ##
#dir = "/sc/hydra/projects/pd-omics/mynd/data/PBG-2022"
#samples = read.table("/sc/hydra/projects/pd-omics/mynd/qc/samples.rna_batch2", header = F, check.names = F)
#sample = samples$V1
#files = file.path(dir, sample, "Processed/RAPiD/rsem", paste0(sample, ".genes.results"))
#names(files) = sample
#txi.rsem1 = tximport(files, type = "rsem")
#counts2 = data.frame(txi.rsem1$counts, check.names = F)
#tpms2 = data.frame(txi.rsem1$abundance, check.names = F)

## We need to change sample names because of overlap between rna_batches. We will append the rna_batch name to the sample ID in the count matrix and then merge the two datasets ##

#colnames(counts1) <- paste(colnames(counts1), "1", sep = "_")
#colnames(tpms1) <- paste(colnames(tpms1), "1", sep = "_")
#colnames(counts2) <- paste(colnames(counts2), "2", sep = "_")
#colnames(tpms2) <- paste(colnames(tpms2), "2", sep = "_")
#dim(counts1)
#dim(counts2)
#dim(tpms1)
#dim(tpms2)
#tot_counts = merge(counts1, counts2, by = 0)
#rownames(tot_counts) = tot_counts$Row.names
#tot_counts$Row.names = NULL
#tot_tpms = merge(tpms1, tpms2, by = 0)
#rownames(tot_tpms) = tot_tpms$Row.names
#tot_tpms$Row.names = NULL
#dim(tot_tpms)
#dim(tot_counts)
#counts = tot_counts
counts = read.table('/sc/hydra/projects/pd-omics/mynd/qc/v1/mynd.filtered.counts.txt', header = T, check.names = F)
#tpms = tot_tpms
tpms = read.table('/sc/hydra/projects/pd-omics/mynd/qc/v1/mynd.filtered.tpms.txt', header = T, check.names = F)

## We need to then do the same for metadata, but rna_batch 1 and 2 are rnaseq rna_batch 1, and rna_batch 3 here, is rna_batch2 ... ##
mynd_metadata = read.table('/sc/hydra/projects/pd-omics/mynd/qc/v1/filtered_metadata.txt', row.names = 1,  header = T, check.names = F)
#rownames(mynd_metadata) = paste(mynd_metadata$Sample, mynd_metadata$rna_batch, sep = "_")
#rownames(mynd_metadata) = gsub("_2", "_1", rownames(mynd_metadata)) 
#rownames(mynd_metadata) = gsub("_3", "_2", rownames(mynd_metadata))
#rownames(mynd_metadata) %in% colnames(tpms)
mynd_metadata = mynd_metadata[colnames(counts),]
mynd_metadata$rna_batch = as.factor(mynd_metadata$rna_batch)
mynd_metadata$library_batch = as.factor(mynd_metadata$library_batch)

Normalization

##Exploratory Analysis ### Library sizes

[1] 244

Density plot

#Density plot with all samples
# cpm table = ppmi_cpm_case_control
# log cpm table = lcpm

lcounts = log2(mynd_counts)
lcpm = log2(mynd_cpm)
ltpm = log2(mynd_abundance)
# voom use log already

nsamples <- ncol(mynd_cpm)
#samplenames <- colnames(mynd_cpm)

colfunc <- colorRampPalette(c("#4DBBD5FF", "#3C5488FF"))
col = alpha(colfunc(nsamples), alpha = 0.1)

#png(paste0(work_plots, "Density_all.#png"), width = 8, height = 6, res = 300, units = "in")
par(mfrow=c(2,2))
#COUNTS
plot(density(lcounts[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="A. COUNTS ", xlab="Log2(counts)")
#abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
  den <- density(lcounts[,i])
  lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")

#CPM
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. CPM ", xlab="Log2(CPM)")
#abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
  den <- density(lcpm[,i])
  lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")

#TPM
plot(density(ltpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="C. TPM ", xlab="Log2(TPM)")
#abline(v=ltpm.cutoff, lty=3)
for (i in 2:nsamples){
  den <- density(ltpm[,i])
  lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")

#Voom
plot(density(mynd_voom[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="D. VOOM ", xlab="voom")
#abline(v=lvoom.cutoff, lty=3)
for (i in 2:nsamples){
  den <- density(mynd_voom[,i])
  lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("right", samplenames, text.col=col, bty="n")
#dev.off()